{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from datetime import datetime, date\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode \n",
    "\n",
    "# ai4eutils\n",
    "import path_utils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# alka_squirrels\n",
    "\n",
    "The ideal dataset has both **location** and **sequence** information, in addition to any species or bounding box labels."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Give the path to a JSON file where output from this script will be written to. You can then take this file to the .Net app for ingestion to the database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_to_output = '/home/mink/camtraps/data/megadb_jsons/alka_squirrels.json' \n",
    "path_to_output_temp = '/home/mink/camtraps/data/megadb_jsons/alka_squirrels_temp.json' "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'alka_squirrels'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1b - If you're starting from scratch...\n",
    "\n",
    "i.e. starting from metadata in another format, e.g. spreadsheets, CSVs, \n",
    "\n",
    "you would need to compile the metadata into a list of `sequence` items - format specified at https://github.com/agentmorris/MegaDetector/tree/master/data_management/megadb.\n",
    "\n",
    "Call the result `sequences`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2', 'tagged', 'ALKA2018_cameratrap_miroslav', '1', 'annotations']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# all content in the container was first AzCopied to mink_disk_0\n",
    "container_root = '/mink_disk_0/camtraps/alka_squirrels/alka/'\n",
    "os.listdir(container_root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "folder1 = '/mink_disk_0/camtraps/alka_squirrels/alka/1'\n",
    "folder2 = '/mink_disk_0/camtraps/alka_squirrels/alka/2'\n",
    "folder3 = '/mink_disk_0/camtraps/alka_squirrels/alka/ALKA2018_cameratrap_miroslav'\n",
    "folder4 = '/mink_disk_0/camtraps/alka_squirrels/alka/tagged'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'JPG', 'jpg'}"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extensions = set([p.split('.')[-1] \n",
    "                  for p in path_utils.recursive_file_list(container_root, convert_slashes=False) \n",
    "                  if path_utils.is_image_file(p)])\n",
    "extensions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Folders 1 and 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7805"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "34873"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paths12 = path_utils.recursive_file_list(folder1, convert_slashes=False)\n",
    "len(paths12)\n",
    "paths12.extend(path_utils.recursive_file_list(folder2, convert_slashes=False))\n",
    "len(paths12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 34873/34873 [00:00<00:00, 166282.19it/s]\n"
     ]
    }
   ],
   "source": [
    "seq_dict = defaultdict(list)\n",
    "\n",
    "for p in tqdm(paths12):\n",
    "    \n",
    "    fn = os.path.basename(p).split('.')[0]\n",
    "    frame = fn.split('C')[1]\n",
    "    seq_id = p.split('/')[-2].replace(' ', '_')  # same as location\n",
    "    \n",
    "    parts = fn.split(' ')\n",
    "    year = int(parts[0])\n",
    "    month = int(parts[1])\n",
    "    day = int(parts[2])\n",
    "    \n",
    "    seq_dict[seq_id].append({\n",
    "        'file': p.split(container_root)[1],\n",
    "        'class': ['unlabeled'],\n",
    "        'frame_num': int(frame),\n",
    "        'datetime': str(date(year=year, month=month, day=day))\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['2020_05_07_54C', '2020_05_07_50C', '2020_05_08_32C', '2020_05_07_48C', '2020_05_08_45C', '2020_05_08_35C', '2020_05_08_38C', '2020_05_08_25C', '2020_05_08_39C', '2020_05_08_42C', '2020_05_08_30C', '2020_05_07_47C', '2020_05_07_35C', '2020_05_07_25C', '2020_05_07_30C', '2020_05_07_32C'])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seq_dict.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'file': '2/2020 05 08 42C/2020 05 08 42C102621.JPG',\n",
       " 'class': ['unlabeled'],\n",
       " 'frame_num': 102621,\n",
       " 'datetime': '2020-05-08'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seq_dict['2020_05_08_42C'][-1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Folder ALKA2018_cameratrap_miroslav"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_id = 'ALKA2018_miroslav'\n",
    "seq_dict[seq_id] = []\n",
    "\n",
    "for fn in os.listdir(folder3):\n",
    "    if path_utils.is_image_file(fn):\n",
    "    \n",
    "        p = os.path.join(folder3, fn)\n",
    "        frame = fn.split('.')[0].split('__')[1]\n",
    "        \n",
    "        seq_dict[seq_id].append({\n",
    "            'file': p.split(container_root)[1],\n",
    "            'class': ['unlabeled'],\n",
    "            'frame_num': int(frame),\n",
    "            'datetime': str(date(year=int(fn[:4]), month=int(fn[4:6]), day=int(fn[6:8])))\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'file': 'ALKA2018_cameratrap_miroslav/20180608NBR10L__01447.JPG',\n",
       " 'class': ['unlabeled'],\n",
       " 'frame_num': 1447,\n",
       " 'datetime': '2018-06-08'}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seq_dict['ALKA2018_miroslav'][-10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Folder tagged\n",
    "\n",
    "Don't re-run the cell below"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_id_prefix = 'tagged'\n",
    "\n",
    "for fn in os.listdir(folder4):\n",
    "    if path_utils.is_image_file(fn):\n",
    "    \n",
    "        p = os.path.join(folder4, fn)\n",
    "        name = fn.split('.')[0]\n",
    "        \n",
    "        seq_id = seq_id_prefix + '_' + name.split('_')[0]\n",
    "        frame = name.split('_')[1]\n",
    "        \n",
    "        seq_dict[seq_id].append({\n",
    "            'file': p.split(container_root)[1],\n",
    "            'class': ['unlabeled'],\n",
    "            'frame_num': int(frame),\n",
    "            'datetime': str(date(year=2018, month=6, day=7))  # seems to be all from this day\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'file': 'tagged/9_2461.jpg',\n",
       " 'class': ['unlabeled'],\n",
       " 'frame_num': 2461,\n",
       " 'datetime': '2018-06-07'}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "seq_dict['tagged_9'][-10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Combine folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "27"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "37449"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = []\n",
    "num_images = 0\n",
    "\n",
    "for seq_id, seq_images in seq_dict.items():\n",
    "    sequences.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': seq_id,\n",
    "        'location': seq_id,\n",
    "        'images': seq_images\n",
    "    })\n",
    "    num_images += len(seq_images)\n",
    "\n",
    "len(sequences)\n",
    "num_images"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second step will take some time (~ a minute). \n",
    "\n",
    "Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 1.9 s, sys: 0 ns, total: 1.9 s\n",
      "Wall time: 1.9 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save the entries while we wait for the annotations to come back."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - copy images to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 27/27 [1:24:56<00:00, 188.76s/it]  \n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "37449"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dst_paths = []\n",
    "\n",
    "for seq in tqdm(sequences):\n",
    "    seq_id = seq['seq_id']\n",
    "    for im in seq['images']:\n",
    "        src_path = os.path.join(container_root, im['file'])\n",
    "        assert os.path.exists(src_path), src_path\n",
    "        frame = im['frame_num']\n",
    "        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12', f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "        \n",
    "        dst_paths.append(copyfile(src_path, dst_path))\n",
    "        \n",
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
